A. 16S analysis

Network analysis on relative abundances

In the first analysis we utilize the raw counts. As such this network will be built using relative abundance data.

# Set seed
set.seed(777)

# Import data
otus <- as.matrix(read.csv2("16S_data/OTU_table_raw.csv", fill = TRUE, header = TRUE,
                    row.names = 1))

taxa <- tax_table(as.matrix(read.csv2("16S_data/tax_table_raw.csv", fill = TRUE, header = TRUE,
                    row.names = 1))
)

phy_df <- phyloseq(otu_table(otus, taxa_are_rows = FALSE), taxa)

# filterobj <- filterTaxonMatrix(otus, minocc = 20,
#                               keepSum = TRUE, return.filtered.indices = TRUE)
# otus.f <- filterobj$mat
# taxa.f <- taxa[setdiff(1:nrow(taxa), filterobj$filtered.indices),]
# dummyTaxonomy <- colnames(tax_df); dummyTaxonomy[1] <- "Kingdom_dummy"
# taxa.f <- rbind(taxa.f, dummyTaxonomy)
# rownames(taxa.f)[nrow(taxa.f)] <- "0"
# rownames(otus.f)[nrow(otus.f)] <- "0"
# 
# # Next, we assemble a new phyloseq object with the filtered OTU and taxonomy tables.
# updatedotus <- otu_table(otus.f, taxa_are_rows = TRUE)
# updatedtaxa <- tax_table(taxa.f)
# phyloseqobj.f <- phyloseq(updatedotus, updatedtaxa)

# Prevalence filtering
phy_df_filtered <- filter_taxa(phy_df, function(x) sum(x > 30) > (0.25*length(x)), TRUE)

sp_easi <- spiec.easi(phy_df_filtered, method='mb', lambda.min.ratio=1e-2,
                           nlambda=20, icov.select.params=list(rep.num=50))
## Normalizing/clr transformation of data with pseudocount ...
## Inverse Covariance Estimation with mb ...
## Model selection with stars ...
## Done!
ig.mb <- adj2igraph(sp_easi$refit,  vertex.attr = list(name=taxa_names(phy_df_filtered)))
vsize <- Biobase::rowMedians(clr(otu_table(phy_df_filtered), 1))+15
Lineage_rel <- tax_table(phy_df_filtered)[,"Lineage"]
Lineage_rel <- factor(Lineage_rel, levels = unique(Lineage_rel))
vweights <- summary(symBeta(getOptBeta(sp_easi), mode='maxabs'))
MAGs <- c(); MAGs[taxa_names(phy_df_filtered)=="Otu00001"]  <- "Ramlibacter MAG"
MAGs[taxa_names(phy_df_filtered)=="Otu00002"]  <- "Bacteroidetes sp. MAG1"
MAGs[taxa_names(phy_df_filtered)=="Otu00003"]  <- "Bacteroidetes sp. MAG2"
MAGs[is.na(MAGs)] <- ""

# png(file = "./Figures/Figures_network/NETWORK-REL-CX-C30-A25.png", width = 9, height = 9, res = 500, units = "in")
plot_network_custom(ig.mb, phy_df_filtered, type='taxa',
             line_weight = 2, hjust = 0.5,
             point_size = 0.1, alpha = 0.01, label_size = 3.95)+
  # scale_fill_brewer(palette = "Paired")+
  # scale_color_brewer(palette = "Paired")+
  scale_fill_manual(values = c("#e2a2fd", brewer.pal(n = 12, "Paired")[c(3:8,1:2,11,12)]) )+
  geom_point(aes(size = vsize, fill = Lineage_rel), alpha = 0.5,
             colour="black", shape=21)+
  guides(size = FALSE,
    fill  = guide_legend(title = "Lineage", override.aes = list(size = 5),
                         nrow = 4),
    color = FALSE)+
  theme(legend.position="bottom", legend.text=element_text(size=12),
        text = element_text(size = 12),
        plot.margin = unit(c(1,1,1,1), "cm"))+
  scale_size(range = c(5, 15))+
  geom_label_repel(aes(label = MAGs), fontface = 'bold', color = 'black',
                   box.padding = 0.35, point.padding = 0.5,
                   segment.color = 'black',
                   size = 4,
                       # Width of the line segments.
                   segment.size = 1.5,
                   # Draw an arrow from the label to the data point.
                   arrow = arrow(length = unit(0.015, 'npc')),
                   nudge_x = -0.1,
                   nudge_y = 0.6
  )

# dev.off()

Network analysis on absolute abundances

The second network will be built using absolute abundance data by multiplying the relative taxon abundances by the total cell density. The final obtained counts will expressed as nr. of cells measured in 50 µL samples. We also only consider the OTUs that were left after the prevalence filtering conducted in the network construction with relative abundance data.

# Import cell count data
cell_counts <- read.csv("16S_data/cell_counts.csv")
cell_counts$sample_title <- as.factor(cell_counts$sample_title)

# Import metadata
meta_16S <- read.csv("16s_data/Metadata.csv")[1:77,]; rownames(meta_16S) <- meta_16S$sample_title

# Calculate proportions
phy_df_rel <- transform_sample_counts(phy_df, function(x) x/sum(x))

# Add metadata
sample_data(phy_df_rel) <- sample_data(meta_16S)

# Select samples for which corresponding counts are available
cell_counts <- cell_counts[cell_counts$sample_title %in% sample_names(phy_df_rel), ]
cell_counts <- droplevels(cell_counts)
phy_df_rel <- prune_samples(sample_names(phy_df_rel) %in% cell_counts$sample_title, phy_df_rel)

# Multiply with cell counts in 50 µL of sample
otu_table(phy_df_rel) <- otu_table(phy_df_rel) * cell_counts$Number_of_cells

# Select taxa that were selected based on prevalence in previous chunk
phy_df_abs <- prune_taxa(taxa_names(phy_df_filtered), phy_df_rel)

# Round absolute abundances to integers
otu_table(phy_df_abs) <- round(otu_table(phy_df_abs), 0)

# Construct network
sp_easi_abs <- spiec.easi(phy_df_abs, method='mb', lambda.min.ratio=1e-2,
                           nlambda=20, icov.select.params=list(rep.num=50))
## Normalizing/clr transformation of data with pseudocount ...
## Inverse Covariance Estimation with mb ...
## Model selection with stars ...
## Done!
ig.mb_abs <- adj2igraph(sp_easi_abs$refit,  vertex.attr = list(name=taxa_names(phy_df_abs)))
vsize_abs <- Biobase::rowMedians(clr(otu_table(phy_df_abs), 1))+15
Lineage_abs <- tax_table(phy_df_abs)[,"Lineage"]
Lineage_abs <- factor(Lineage_abs, levels = unique(Lineage_abs))
vweights_abs <- summary(symBeta(getOptBeta(sp_easi_abs), mode='maxabs'))
MAGs <- c(); MAGs[taxa_names(phy_df_abs)=="Otu00001"]  <- "Ramlibacter sp. MAG"
MAGs[taxa_names(phy_df_abs)=="Otu00002"]  <- "Bacteroidetes sp. MAG1"
MAGs[taxa_names(phy_df_abs)=="Otu00003"]  <- "Bacteroidetes sp. MAG2"
MAGs[is.na(MAGs)] <-""

# Plot network inferred from absolute abundances
# png(file = "./Figures/Figures_network/NETWORK-ABS-CX-C30-A25.png", width = 9, height = 9, res = 500, units = "in")
plot_network_custom(ig.mb_abs, phy_df_abs, type='taxa',
             line_weight = 2, hjust = 0.5,
             point_size = 0.1, alpha = 0.01, label_size = 3.95)+
  # scale_fill_brewer(palette = "Paired")+
  # scale_color_brewer(palette = "Paired")+
  scale_fill_manual(values = c("#e2a2fd", brewer.pal(n = 12, "Paired")[c(3:8,1:2,11,12)]) )+
  geom_point(aes(size = vsize_abs, fill = Lineage_abs), alpha = 0.5,
             colour="black", shape=21)+
  guides(size = FALSE,
    fill  = guide_legend(title = "Lineage", override.aes = list(size = 5),
                         nrow = 4),
    color = FALSE)+
  theme(legend.position="bottom", legend.text=element_text(size=12),
        text = element_text(size = 12),
        plot.margin = unit(c(1,1,1,1), "cm"))+
  scale_size(range = c(5, 15))+
  geom_label_repel(aes(label = MAGs), fontface = 'bold', color = 'black',
                   box.padding = 0.35, point.padding = 0.5,
                   segment.color = 'black',
                   size = 4,
                       # Width of the line segments.
                   segment.size = 1.5,
                   # Draw an arrow from the label to the data point.
                   arrow = arrow(length = unit(0.015, 'npc')),
                   nudge_x = -0.1,
                   nudge_y = 0.6
  )

# dev.off()
# Plot absolute OTU dynamics of OTU1
df_abs <- psmelt(phy_df_abs)

p_abs_otu1 <- df_abs %>% dplyr::filter(OTU == "Otu00001") %>% 
  ggplot(aes(x = Timepoint, y = Abundance))+
  facet_grid(.~Reactor.cycle)+
  scale_shape_manual(values = c(21,24))+
  geom_line(size = 1.5, linetype = 2, color = adjustcolor("#000000", 0.5))+
  geom_point(size = 4, fill = "#e2a2fd", aes(shape = Reactor_status), alpha = 0.8,
             color = "black")+
  theme_bw()+
  ylab(expression("Otu00001 abundance - cells mL"^"-1"))+
  xlab("Time relative to reactor start - days")+
  scale_y_continuous(breaks = seq(0,50e3,5e3), limits = c(0,30e3))+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        strip.text.x=element_text(size=18),
        legend.position = "top")+
  guides(shape = guide_legend(title="Reactor status", ncol =1))

print(p_abs_otu1)

B. MetaG analysis

# Read data
mean_coverage <- read.table("./SAMPLES-SUMMARY/bins_across_samples/mean_coverage.txt", header = TRUE)
std_coverage <- read.table("./SAMPLES-SUMMARY/bins_across_samples/std_coverage.txt", header = TRUE)
bin_size <- read.table("./SAMPLES-SUMMARY/general_bins_summary.txt", header = TRUE)[, c(1, 3, 6, 9)]
total_reads <- read.table("./sample_reads.tsv", header = TRUE)
read_length <- 300

# From wide to long format
mean_coverage_long <- gather(mean_coverage, Sample_ID, coverage, 
                             SAMPLE_16:SAMPLE_65, factor_key=TRUE)

std_coverage_long <- gather(std_coverage, Sample_ID, std_coverage, 
                            SAMPLE_16:SAMPLE_65, 
                            factor_key=TRUE)

coverage_data <- data.frame(mean_coverage_long, 
                            std_coverage = std_coverage_long[,3])

# Read and add metadata
# meta <- read.csv2("metadata.csv")
# meta$Sample_ID <- gsub(meta$Sample_ID, pattern = ".", replacement = "_", fixed = TRUE)
data_total <- left_join(coverage_data, total_reads, by = "Sample_ID")
data_total <- left_join(data_total, bin_size, by = "bins")
# data_total <- left_join(data_total, meta, by =  "Sample_ID")
data_total$bins <- plyr::revalue(data_total$bins, c("BetIa_bin"="Limnohabitans_bin", "bacIa_vizbin1"="Bacteroidetes_bin1",
                                              "bacIa_vizbin2"="Bacteroidetes_bin2"))
# Calculate relative abundance of the bins
data_total$mean_rel_abundance <- 100*(data_total$coverage*data_total$bin_size)/(read_length*data_total$Total_reads)
data_total$upper_rel_abundance <- 100*((data_total$coverage+data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Total_reads)
data_total$lower_rel_abundance <- 100*((data_total$coverage-data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Total_reads)

data_total$mean_rel_abundance_map <- 100*(data_total$coverage*data_total$bin_size)/(read_length*data_total$Mapped_reads)
data_total$upper_rel_abundance_map <- 100*((data_total$coverage+data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Mapped_reads)
data_total$lower_rel_abundance_map <- 100*((data_total$coverage-data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Mapped_reads)

# Add additional column that assigns PNCs to correct MAG
data_total$Genome_id <- factor(rep(c("Limnohabitans", "Limnohabitans", "Limnohabitans", "Bacteroidetes MAG2", "Bacteroidetes MAG1", "Bacteroidetes MAG2"), 4))

# Plot genome size for all three genomes
data_total[data_total$bins %in% c("Bacteroidetes_bin1",
                                "Bacteroidetes_bin2","Limnohabitans_bin"), ] %>% 
  ggplot(aes(x = bins, y = bin_size/(4*1e6), fill = Genome_id))+
  theme_bw()+
  geom_bar(width = 0.5, stat="identity", alpha = 0.7)+
  coord_flip()+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme(axis.text.x = element_text(angle = 0, size = 16),
        axis.text.y = element_text(size = 16),
        axis.title.x = element_text(size = 18),
        legend.title = element_text("Genome bin"), legend.position = "top")+
  xlab("")+
  ylab("Genome size (Mbp)")+
  guides(fill = FALSE)+
  geom_hline(yintercept = 1, size = 2, linetype="dotted")

# Plot irep for all 3 genomes
data_total[data_total$bins %in% c("Bacteroidetes_bin1",
                                "Bacteroidetes_bin2","Limnohabitans_bin"), ] %>% 
  ggplot(aes(x = bins, y = mean_irep/4, fill = Genome_id))+
  theme_bw()+
  geom_bar(width = 0.5, stat="identity", alpha = 0.7)+
  coord_flip()+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme(axis.text.x = element_text(angle = 0, size = 16),
        axis.text.y = element_text(size = 16),
        axis.title.x = element_text(size = 18),
        legend.title = element_text(""), legend.position = "top")+
  xlab("")+
  ylab("Index of Replication (iRep)")+
  guides(fill = FALSE)+
  geom_hline(yintercept = 1.34, size = 2, linetype="dotted")

1. Phylogenetic tree

Limnohabitans

Bacteroidetes

2. Investigate MAG- and 16S-based abundances

It is clear that there is significant %GC coverage bias present. The estimated relative abundances from metagenomics do not quantitatively match with the V3-V4 16S rRNA gene amplicon data. \[Relative\ abundance =100*(\frac{mean\ coverage * bin\ size}{read\ length*total\ sample\ reads })\] Another option is to calculate relative to mapped number of reads: \[Relative\ abundance =100*(\frac{mean\ coverage * bin\ size}{read\ length*total\ sample\ reads * \%mapped\ reads})\]

Import reference relative abundances from 16S data set in order to directly compare with metagenomic data set.

df_16S <- read.table("relative_abundance_16S.tsv", header = TRUE)
df_16S_long <- gather(df_16S, Sample_ID, relative_abundance_16S, 
                             SAMPLE_16:SAMPLE_65, factor_key=TRUE)
# Subset for only the three complete genomes (not PNCs).
data_total_sb <- data_total[data_total$bins %in% c("Limnohabitans_bin", "Bacteroidetes_bin1", "Bacteroidetes_bin2"),]

p_meta <- ggplot(data = data_total_sb, aes(x = bins, y = mean_rel_abundance, fill = bins))+
  geom_point(size = 4, shape = 21, alpha = 0.7)+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme_bw()+
  geom_errorbar(aes(ymin=lower_rel_abundance, 
                    ymax=upper_rel_abundance, 
                    width=.1))+
  facet_grid(.~Sample_ID)+
  # ylim(0,1)+ 
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("Mean relative abundance (%)")+
  ylim(-1,100)+
  ggtitle("Metagenomic - total reads")

# Corrected for mapped N° of reads
p_meta_mapped <- ggplot(data = data_total_sb, aes(x = bins, y = mean_rel_abundance_map, fill = bins))+
  geom_point(size = 4, shape = 21, alpha = 0.7)+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme_bw()+
  geom_errorbar(aes(ymin=lower_rel_abundance_map, 
                    ymax=upper_rel_abundance_map, 
                    width=.1))+
  facet_grid(.~Sample_ID)+
  # ylim(0,1)+ 
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("Mean relative abundance (%)")+
  ylim(-1,100)+
  ggtitle("Metagenomic - mapped reads")

p_16S <- ggplot(data = df_16S_long, aes(x = bins, y = relative_abundance_16S, fill = bins))+
  geom_point(size = 4, shape = 21, alpha = 0.7)+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme_bw()+
  facet_grid(.~Sample_ID)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("Mean relative abundance (%)")+
  ylim(-1,100)+
  ggtitle("V3-V4 16S")

grid_arrange_shared_legend(p_meta, p_meta_mapped, p_16S, ncol = 1, nrow = 3)

3. Investigate sequence characteristics within coding DNA sequences (CDS)

# First we need the files that map the gene ID to the sequence ID (linux cmd: https://github.com/rprops/MetaG_lakeMI/wiki/11.-Genome-annotation)
# These are stored in the IMG_annotation data for each genome bin

# Next, extract the %GC of each gene from the gff file
extract_gc_from_gff("./IMG_annotation/IMG_2724679690_Limnohabitans/121950.assembled.gff", outputFolder = "GC_analysis")
extract_gc_from_gff("./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/121951.assembled.gff", outputFolder = "GC_analysis")
extract_gc_from_gff("./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/121960.assembled.gff", outputFolder = "GC_analysis")

# Use these files to make dataframes mapping function (COGs/Pfams/KO) and %GC
LIMNO_gc_cog <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121950.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.cog.tab.txt", gc_thresh = 0.1, output = FALSE)
## Wed Nov 15 12:08:29 2017  --- There are 2830 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017  --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017  --- The 10 genes with the highest GC% are:
##      function_id                                             function_name
## 2821     COG0405                              Gamma-glutamyltranspeptidase
## 2822     COG2755                  Lysophospholipase L1 or related esterase
## 2823     COG0642                      Signal transduction histidine kinase
## 2824     COG1514                                          2'-5' RNA ligase
## 2825     COG1767                Triphosphoribosyl-dephospho-CoA synthetase
## 2826     COG1261         Flagella basal body P-ring formation protein FlgA
## 2827     COG0665                Glycine/D-amino acid oxidase (deaminating)
## 2828     COG0810 Periplasmic protein TonB, links inner and outer membranes
## 2829     COG1040                 Predicted amidophosphoribosyltransferases
## 2830     COG1240                                 Mg-chelatase subunit ChlD
##        GC
## 2821 78.5
## 2822 78.5
## 2823 78.5
## 2824 78.5
## 2825 78.6
## 2826 78.6
## 2827 79.1
## 2828 79.5
## 2829 79.7
## 2830 80.6
BAC1_gc_cog <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121951.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1//Annotation/2724679691.cog.tab.txt", gc_thresh = 0.1, output = FALSE)
## Wed Nov 15 12:08:29 2017  --- There are 1889 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017  --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017  --- The 10 genes with the highest GC% are:
##      function_id
## 1880     COG0052
## 1881     COG0183
## 1882     COG0629
## 1883     COG0509
## 1884     COG1734
## 1885     COG0636
## 1886     COG3502
## 1887     COG1765
## 1888     COG0377
## 1889     COG4104
##                                                                               function_name
## 1880                                                                   Ribosomal protein S2
## 1881                                                           Acetyl-CoA acetyltransferase
## 1882                                                    Single-stranded DNA-binding protein
## 1883                                    Glycine cleavage system H protein (lipoate-binding)
## 1884                                       RNA polymerase-binding transcription factor DksA
## 1885 FoF1-type ATP synthase, membrane subunit c/Archaeal/vacuolar-type H+-ATPase, subunit K
## 1886                                       Uncharacterized conserved protein, DUF952 family
## 1887                                                   Uncharacterized OsmC-related protein
## 1888 NADH:ubiquinone oxidoreductase 20 kD subunit (chhain B) or related Fe-S oxidoreductase
## 1889                 Zn-binding Pro-Ala-Ala-Arg (PAAR) domain, incolved in TypeVI secretion
##        GC
## 1880 51.6
## 1881 51.6
## 1882 52.0
## 1883 52.6
## 1884 52.7
## 1885 52.8
## 1886 52.9
## 1887 53.6
## 1888 53.6
## 1889 59.3
BAC2_gc_cog <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121960.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.cog.tab.txt", gc_thresh = 0.1, output = FALSE)
## Wed Nov 15 12:08:29 2017  --- There are 1797 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017  --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017  --- The 10 genes with the highest GC% are:
##      function_id
## 1788     COG4675
## 1789     COG0636
## 1790     COG1501
## 1791     COG0725
## 1792     COG5434
## 1793     COG2115
## 1794     COG4225
## 1795     COG3669
## 1796     COG4588
## 1797     COG3258
##                                                                               function_name
## 1788                                      Microcystin-dependent protein  (function unknown)
## 1789 FoF1-type ATP synthase, membrane subunit c/Archaeal/vacuolar-type H+-ATPase, subunit K
## 1790                                      Alpha-glucosidase, glycosyl hydrolase family GH31
## 1791                             ABC-type molybdate transport system, periplasmic component
## 1792                                                                      Polygalacturonase
## 1793                                                                       Xylose isomerase
## 1794                                                      Rhamnogalacturonyl hydrolase YesR
## 1795                                                                     Alpha-L-fucosidase
## 1796               Accessory colonization factor AcfC, contains ABC-type periplasmic domain
## 1797                                                                           Cytochrome c
##        GC
## 1788 44.5
## 1789 44.6
## 1790 44.9
## 1791 45.0
## 1792 45.1
## 1793 45.1
## 1794 45.4
## 1795 45.5
## 1796 46.6
## 1797 46.8
LIMNO_gc_pfam <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121950.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.pfam.tab.txt", gc_thresh = 0.1, output = FALSE)
## Wed Nov 15 12:08:29 2017  --- There are 4954 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017  --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017  --- The 10 genes with the highest GC% are:
##      function_id function_name   GC
## 4945   pfam13202     EF-hand_5 79.0
## 4946   pfam16537         T2SSB 79.0
## 4947   pfam01266           DAO 79.1
## 4948   pfam03544        TonB_C 79.5
## 4949   pfam00156  Pribosyltran 79.7
## 4950   pfam11142       DUF2917 79.8
## 4951   pfam13318       DUF4089 79.8
## 4952   pfam13519         VWA_2 80.6
## 4953   pfam02120      Flg_hook 80.9
## 4954   pfam03023          MVIN 81.7
BAC1_gc_pfam <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121951.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1//Annotation/2724679691.pfam.tab.txt", gc_thresh = 0.1, output = FALSE)
## Wed Nov 15 12:08:29 2017  --- There are 3929 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017  --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017  --- The 10 genes with the highest GC% are:
##      function_id function_name   GC
## 3920   pfam02803    Thiolase_C 51.6
## 3921   pfam00436           SSB 52.0
## 3922   pfam00171        Aldedh 52.2
## 3923   pfam01597         GCV_H 52.6
## 3924   pfam01258  zf-dskA_traR 52.7
## 3925   pfam00137    ATP-synt_C 52.8
## 3926   pfam06108        DUF952 52.9
## 3927   pfam02566          OsmC 53.6
## 3928   pfam01058   Oxidored_q6 53.6
## 3929   pfam05488    PAAR_motif 59.3
BAC2_gc_pfam <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121960.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.pfam.tab.txt", gc_thresh = 0.1, output = FALSE)
## Wed Nov 15 12:08:29 2017  --- There are 3573 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017  --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017  --- The 10 genes with the highest GC% are:
##      function_id   function_name   GC
## 3564   pfam13531      SBP_bac_11 46.6
## 3565   pfam13442 Cytochrome_CBB3 46.8
## 3566   pfam12779          YXWGXW 50.0
## 3567   pfam12779          YXWGXW 50.0
## 3568   pfam01391        Collagen 50.4
## 3569   pfam01391        Collagen 50.4
## 3570   pfam01391        Collagen 50.4
## 3571   pfam01391        Collagen 50.4
## 3572   pfam01391        Collagen 50.4
## 3573   pfam01391        Collagen 50.4
LIMNO_gc_KO <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121950.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.ko.tab.txt", gc_thresh = 0.1, output = FALSE)
## Wed Nov 15 12:08:30 2017  --- There are 2164 genes with > 0.1 %
## Wed Nov 15 12:08:30 2017  --- This is 100 % of all genes
## Wed Nov 15 12:08:30 2017  --- The 10 genes with the highest GC% are:
##                                                                                         function_id
## 2155                  two-component system, OmpR family, sensor histidine kinase QseC [EC:2.7.13.3]
## 2156                                                                  2'-5' RNA ligase [EC:6.5.1.-]
## 2157                                         triphosphoribosyl-dephospho-CoA synthase [EC:2.4.2.52]
## 2158                                              flagella basal body P-ring formation protein FlgA
## 2159                                                            general secretion pathway protein B
## 2160 tRNA 5-methylaminomethyl-2-thiouridine biosynthesis bifunctional protein [EC:2.1.1.61 1.5.-.-]
## 2161 tRNA 5-methylaminomethyl-2-thiouridine biosynthesis bifunctional protein [EC:2.1.1.61 1.5.-.-]
## 2162                                                4'-phosphopantetheinyl transferase [EC:2.7.8.-]
## 2163                                                     magnesium chelatase subunit D [EC:6.6.1.1]
## 2164                                                     flagellar hook-length control protein FliK
##      function_name   GC
## 2155   EC:2.7.13.3 78.5
## 2156    EC:6.5.1.- 78.5
## 2157   EC:2.4.2.52 78.6
## 2158               78.6
## 2159               79.0
## 2160      EC:1.5.- 79.1
## 2161   EC:2.1.1.61 79.1
## 2162    EC:2.7.8.- 79.1
## 2163    EC:6.6.1.1 80.6
## 2164               80.9
BAC1_gc_KO <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121951.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1//Annotation/2724679691.ko.tab.txt", gc_thresh = 0.1, output = FALSE)
## Wed Nov 15 12:08:30 2017  --- There are 1384 genes with > 0.1 %
## Wed Nov 15 12:08:30 2017  --- This is 100 % of all genes
## Wed Nov 15 12:08:30 2017  --- The 10 genes with the highest GC% are:
##                                             function_id function_name   GC
## 1375                  single-strand DNA-binding protein               51.3
## 1376                    threonine aldolase [EC:4.1.2.5]    EC:4.1.2.5 51.3
## 1377                large subunit ribosomal protein L32               51.3
## 1378                            uncharacterized protein               51.5
## 1379                 translation initiation factor IF-2               51.5
## 1380                 small subunit ribosomal protein S2               51.6
## 1381                  single-strand DNA-binding protein               52.0
## 1382                  glycine cleavage system H protein               52.6
## 1383            F-type H+-transporting ATPase subunit c               52.8
## 1384 NADH-quinone oxidoreductase subunit B [EC:1.6.5.3]    EC:1.6.5.3 53.6
BAC2_gc_KO <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121960.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.ko.tab.txt", gc_thresh = 0.1, output = FALSE)
## Wed Nov 15 12:08:30 2017  --- There are 1342 genes with > 0.1 %
## Wed Nov 15 12:08:30 2017  --- This is 100 % of all genes
## Wed Nov 15 12:08:30 2017  --- The 10 genes with the highest GC% are:
##                                             function_id function_name   GC
## 1333                            uncharacterized protein               43.9
## 1334 NADH-quinone oxidoreductase subunit B [EC:1.6.5.3]    EC:1.6.5.3 44.3
## 1335            F-type H+-transporting ATPase subunit c               44.6
## 1336      alpha-D-xyloside xylohydrolase [EC:3.2.1.177]  EC:3.2.1.177 44.9
## 1337                      xylose isomerase [EC:5.3.1.5]    EC:5.3.1.5 45.1
## 1338                   alpha-L-fucosidase [EC:3.2.1.51]   EC:3.2.1.51 45.4
## 1339                            uncharacterized protein               45.5
## 1340                            uncharacterized protein               45.8
## 1341                 accessory colonization factor AcfC               46.6
## 1342             thiosulfate dehydrogenase [EC:1.8.2.2]    EC:1.8.2.2 46.8

Motivation: For COGs there exists a hierarchy allowing us to investigate whether there is a conservation of high/low %GC in certain functional gene groups. In order to do this we need to incorporate this hierarchy into the genome dataframes we have now.

4. Analysis of gene length distribution

Here we use the dataframe made in the previous section to see if there is a significant difference in the gene length of the COGs within these three consensus genomes.

Observation: They have very small genes: on average < 500bp.

# Limnohabitans MAG gene length distribution 
p_LIMNO_length <- easyGgplot2::ggplot2.histogram(data = LIMNO_gc_cog, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15,
                  groupColors = col_limno,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,15)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Limnohabitans MAG")+ xlim(0,2000)

# Bacteroidetes MAG1 gene length distribution 
p_BAC1_length <- easyGgplot2::ggplot2.histogram(data = BAC1_gc_cog, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15, 
                  groupColors = col_bac1,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,15)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Bacteroidetes MAG1")+ xlim(0,2000)

# Bacteroidetes MAG2 gene length distribution 
p_BAC2_length <- easyGgplot2::ggplot2.histogram(data = BAC2_gc_cog, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15,
                  groupColors = col_bac2,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,15)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Bacteroidetes MAG2")+ xlim(0,2000)

grid.arrange(p_LIMNO_length, p_BAC1_length, p_BAC2_length, ncol = 3)

We can do the same for the Pfams.

# Limnohabitans MAG gene length distribution 
p_LIMNO_length <- easyGgplot2::ggplot2.histogram(data = LIMNO_gc_pfam, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15,
                  groupColors = col_limno,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,25)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Limnohabitans MAG")+ xlim(0,3000)

# Bacteroidetes MAG1 gene length distribution 
p_BAC1_length <- easyGgplot2::ggplot2.histogram(data = BAC1_gc_pfam, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15, 
                  groupColors = col_bac1,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,25)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Bacteroidetes MAG1")+ xlim(0,3000)

# Bacteroidetes MAG2 gene length distribution 
p_BAC2_length <- easyGgplot2::ggplot2.histogram(data = BAC2_gc_pfam, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15,
                  groupColors = col_bac2,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,25)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Bacteroidetes MAG2")+ xlim(0,3000)

grid.arrange(p_LIMNO_length, p_BAC1_length, p_BAC2_length, ncol = 3)

5. Identify unique functional genes (COG/Pfams)

# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG1
unique_LIMNO_BAC1 <- dplyr::anti_join(LIMNO_gc_cog, BAC1_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_LIMNO_BAC1)), "unique COGs in Limnohabitans MAG vs Bacteroidetes MAG1")
## There are 1178 unique COGs in Limnohabitans MAG vs Bacteroidetes MAG1
# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG2
unique_LIMNO_BAC2 <- dplyr::anti_join(LIMNO_gc_cog, BAC2_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_LIMNO_BAC2)), "unique COGs in Limnohabitans MAG vs Bacteroidetes MAG2")
## There are 1143 unique COGs in Limnohabitans MAG vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_BAC1_BAC2 <- dplyr::anti_join(BAC1_gc_cog, BAC2_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_BAC1_BAC2)), "unique COGs in Bacteroidetes MAG1 vs Bacteroidetes MAG2")
## There are 153 unique COGs in Bacteroidetes MAG1 vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_BAC2_BAC1 <- dplyr::anti_join(BAC2_gc_cog, BAC1_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_BAC2_BAC1)), "unique COGs in Bacteroidetes MAG2 vs Bacteroidetes MAG1")
## There are 143 unique COGs in Bacteroidetes MAG2 vs Bacteroidetes MAG1
# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG1
unique_pfam_LIMNO_BAC1 <- dplyr::anti_join(LIMNO_gc_pfam, BAC1_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_LIMNO_BAC1)), "unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG1")
## There are 1474 unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG1
# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG2
unique_pfam_LIMNO_BAC2 <- dplyr::anti_join(LIMNO_gc_pfam, BAC2_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_LIMNO_BAC2)), "unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG2")
## There are 1424 unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_pfam_BAC1_BAC2 <- dplyr::anti_join(BAC1_gc_pfam, BAC2_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_BAC1_BAC2)), "unique Pfams in Bacteroidetes MAG1 vs Bacteroidetes MAG2")
## There are 285 unique Pfams in Bacteroidetes MAG1 vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_pfam_BAC2_BAC1 <- dplyr::anti_join(BAC2_gc_pfam, BAC1_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_BAC2_BAC1)), "unique Pfams in Bacteroidetes MAG2 vs Bacteroidetes MAG1")
## There are 233 unique Pfams in Bacteroidetes MAG2 vs Bacteroidetes MAG1

6. COG functional categories

Get COG ID to COG functional category mapping file here: ftp://ftp.ncbi.nih.gov/pub/wolf/COGs/COG0303/cogs.csv

The exact statistical analysis to compare genomes based on these profiles should be performed in STAMP.

# Import COG mapping file
cogid_2_cogcat <- read.csv("./Mapping_files/cogid_2_cogcat.csv", sep = ",", header = FALSE, fill = TRUE,col.names = c("COG_ID", "COG_class", "function"))[, 1:2]
cogid_2_cogcat <- cogid_2_cogcat[(cogid_2_cogcat$COG_class)!="", ]
cogid_2_cogcat <- droplevels(cogid_2_cogcat)

# Read COG category file
cog_categories <- read.table("./Mapping_files/cog_categories.tsv", header = TRUE, sep = "\t")

# Merge COG metadata
cog_meta <- dplyr::left_join(cog_categories, cogid_2_cogcat, by = c("COG_class" = "COG_class"))
cog_meta <- droplevels(cog_meta)

# Merge genome information of all genome bins
merged_gc_cog <- rbind(LIMNO_gc_cog, BAC1_gc_cog, BAC2_gc_cog)
merged_gc_cog <- data.frame(merged_gc_cog, genome_id = c(rep("Limnohabitans MAG", nrow(LIMNO_gc_cog)), 
                                             rep("Bacteroidetes MAG1", nrow(BAC1_gc_cog)),
                                             rep("Bacteroidetes MAG2", nrow(BAC2_gc_cog)))
                            )

# Merge this metadata with the genome data from before
# COGs with multiple classifications are currently still NA - work on this.
merged_gc_cog <- dplyr::left_join(merged_gc_cog, cog_meta, by = c("cog_id" = "COG_ID"))
merged_gc_cog <- merged_gc_cog[!is.na(merged_gc_cog$COG_functional_category),]

# Visualize distribution across major metabolism functional COG groups per genome.
p_cog_func_group <- ggplot(data = merged_gc_cog, 
                           aes(x=COG_functional_category, fill = COG_functional_cluster))+
  geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
  theme_bw()+
  facet_grid(genome_id~.)+
  scale_fill_brewer(palette = "Accent")+
  labs(x = "Gene length (bp)", y = "Count")+ 
  theme(legend.position="bottom", axis.text.x = element_text(angle = 90, hjust = 1),
                                                   legend.text = element_text(size = 5))+
  guides(fill=guide_legend(nrow=2,byrow=TRUE))

print(p_cog_func_group)

p_cog_func_clust <- ggplot(data = merged_gc_cog, 
                           aes(x=COG_functional_cluster, fill = COG_functional_cluster))+
  geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
  theme_bw()+
  facet_grid(genome_id~.)+
  scale_fill_brewer(palette = "Accent")+
  labs(x = "Gene length (bp)", y = "Count")+ 
  theme(legend.position="bottom",axis.text.x = element_text(angle = 90, hjust = 1),
                                                   legend.text = element_text(size = 5))+
  guides(fill=guide_legend(nrow=2,byrow=TRUE))

print(p_cog_func_clust)

7. KO pathways

# Import data
ko_path_df <- read.table("./Mapping_files/ko00000.keg", header = FALSE, sep = ";",
                      skip = 3, quote = "", fill = TRUE, 
                      col.names = c("Level", "KO", "Function_abbrev", "Function_spec"))
ko_path_df <- ko_path_df[1:(nrow(ko_path_df)-1), ] # remove tailing "!" at the end of file

# Remove empty rows
ko_path_df$KO <- as.character(ko_path_df$KO);  ko_path_df$Level <- as.character( ko_path_df$Level)
ko_path_df$KO[grep("A",ko_path_df$Level)] <- ko_path_df$Level[grep("A",ko_path_df$Level)]
ko_path_df$Level[grep("A",ko_path_df$Level)] <- "A"
ko_path_df <- ko_path_df[!ko_path_df$KO == "", ]
# Extract pathways from dataframe and add them as new factor column
# pathw_hier <- data.frame(Level = ko_path_df$Level[ko_path_df$Level %in% c("A","B","C")], 
#                     Name = ko_path_df$KO[ko_path_df$Level %in% c("A","B","C")]
#                     )
# pathw_hier$Name <- as.character(pathw_hier$Name)
ko_path_df <- data.frame(ko_path_df, level_A = "A", level_B = "B", level_C = "C",
                         stringsAsFactors = FALSE)
ko_path_df$KO <- as.character(ko_path_df$KO)

# Get positions where to replicate higher hierarichcal level
pos_A <- c(c(1:nrow(ko_path_df))[ko_path_df$Level %in% "A"], nrow(ko_path_df)+1)
pos_B <- c(c(1:nrow(ko_path_df))[ko_path_df$Level %in% "B"], nrow(ko_path_df)+1)
pos_C <- c(c(1:nrow(ko_path_df))[ko_path_df$Level %in% "C"], nrow(ko_path_df)+1)


for(i in 1:(length(pos_A)-1)){
  ko_path_df$level_A[pos_A[i]:(pos_A[i+1]-1)] <- ko_path_df$KO[pos_A[i]]
}

for(i in 1:(length(pos_B)-1)){
  ko_path_df$level_B[pos_B[i]:(pos_B[i+1]-1)] <- ko_path_df$KO[pos_B[i]]
}

for(i in 1:(length(pos_C)-1)){
  ko_path_df$level_C[pos_C[i]:(pos_C[i+1]-1)] <- ko_path_df$KO[pos_C[i]]
}

# Remove all rows with level A, B, C - and level column
ko_path_df <- ko_path_df[!ko_path_df$Level %in% c("A","B", "C"), ]
ko_path_df$level_A <- gsub(ko_path_df$level_A, pattern = "<b>|</b>", replacement = "")
ko_path_df$level_B <- gsub(ko_path_df$level_B, pattern = "<b>|</b>", replacement = "")
ko_path_df <- ko_path_df[, -1]

# Annotate merged ko file
merged_gc_ko <- rbind(LIMNO_gc_KO, BAC1_gc_KO, BAC2_gc_KO)
merged_gc_ko <- data.frame(merged_gc_ko, 
                           genome_id = c(rep("Limnohabitans MAG", nrow(LIMNO_gc_KO)),
                                         rep("Bacteroidetes MAG1", nrow(BAC1_gc_KO)),
                                         rep("Bacteroidetes MAG2", nrow(BAC2_gc_KO)))
                            )
merged_gc_ko$ko_id <- gsub(merged_gc_ko$ko_id, pattern = "KO:", replacement = "")
merged_gc_ko <- dplyr::left_join(merged_gc_ko, ko_path_df[, c(1, 4:6)], by = c("ko_id" = "KO"))

# Fill up NA slots with "Unknown" pathway
merged_gc_ko$level_A[is.na(merged_gc_ko$level_A)] <- "Unknown"
merged_gc_ko$level_B[is.na(merged_gc_ko$level_B)] <- "Unknown"
merged_gc_ko$level_C[is.na(merged_gc_ko$level_C)] <- "Unknown"

8. Synonymous Codon Usage Bias analysis using CodonO

  • Get CodonO for linux here: http://sysbio.cvm.msstate.edu/software/CodonO/download

  • Run CU.linux input.genes.fna
  • Output from CodonO:
    inputfile.ok —- the SCUO units for each input sequence based on its order
    inputfile.fik —- the composition ratio of the i-th amino acid in the k-th sequence
    inputfile.hijk —- the frequency of the j-th degenerate codon for amino acid i in each sequence

  • This output was not sufficient to perform any analysis. Therefore I used the web browser to calculate the synonymous codon bias in the genes of the three genomes.

** Codon bias is proportional to mRNA production (Wan et al 2004)

** However, strong correlation between SCUO and %GC has been reported, thereby confounding possible “biological” effects.. Beware..

# Import and format data for Limnohabitans MAG
SCUO_LIMNO <- read.table("./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.genes.fna.codonO.output",
                         sep = ",", blank.lines.skip = TRUE, allowEscapes = FALSE, skipNul = TRUE
                         )
SCUO_LIMNO$V1 <- gsub(SCUO_LIMNO$V1, pattern = "\t", replacement = "")
Gene_LIMNO <- do.call(rbind, strsplit(SCUO_LIMNO$V1[grep("Ga0*", SCUO_LIMNO$V1)], " "))[,2]
SCUO_LIMNO$V1 <- gsub(SCUO_LIMNO$V1, pattern = " ", replacement = "")
SCUO_LIMNO <- data.frame(GC = SCUO_LIMNO$V1[grep(x = SCUO_LIMNO$V1, pattern = "GC*.*=")], 
                         SCUO = rep(SCUO_LIMNO$V1[grep(x = SCUO_LIMNO$V1, pattern = "SCUO*")], each = 4),
                         Gene = rep(Gene_LIMNO, each = 4)
)
SCUO_LIMNO$GC <- as.numeric(gsub(SCUO_LIMNO$GC, pattern = ".*=", replacement = ""))
SCUO_LIMNO$SCUO <- as.numeric(gsub(SCUO_LIMNO$SCUO, pattern = ".*=", replacement = ""))

# Import and format data for Bacteroidetes MAG1
SCUO_BAC1 <- read.table("./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691.genes.fna.codonO.output",
                         sep = ",", blank.lines.skip = TRUE, allowEscapes = FALSE, skipNul = TRUE)
SCUO_BAC1$V1 <- gsub(SCUO_BAC1$V1, pattern = "\t", replacement = "")
Gene_BAC1 <- do.call(rbind, strsplit(SCUO_BAC1$V1[grep("Ga0*", SCUO_BAC1$V1)], " "))[,2]
SCUO_BAC1$V1 <- gsub(SCUO_BAC1$V1, pattern = " ", replacement = "")
SCUO_BAC1 <- data.frame(GC = SCUO_BAC1$V1[grep(x = SCUO_BAC1$V1, pattern = "GC*.*=")], 
                         SCUO = rep(SCUO_BAC1$V1[grep(x = SCUO_BAC1$V1, pattern = "SCUO*")], each = 4),
                         Gene = rep(Gene_BAC1, each = 4)
)
SCUO_BAC1$GC <- as.numeric(gsub(SCUO_BAC1$GC, pattern = ".*=", replacement = ""))
SCUO_BAC1$SCUO <- as.numeric(gsub(SCUO_BAC1$SCUO, pattern = ".*=", replacement = ""))


# Import and format data for Bacteroidetes MAG2
SCUO_BAC2 <- read.table("./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.genes.fna.codonO.output",
                         sep = ",", blank.lines.skip = TRUE, allowEscapes = FALSE, skipNul = TRUE)
SCUO_BAC2$V1 <- gsub(SCUO_BAC2$V1, pattern = "\t", replacement = "")
Gene_BAC2 <- do.call(rbind, strsplit(SCUO_BAC2$V1[grep("Ga0*", SCUO_BAC2$V1)], " "))[,2]
SCUO_BAC2$V1 <- gsub(SCUO_BAC2$V1, pattern = " ", replacement = "")
SCUO_BAC2 <- data.frame(GC = SCUO_BAC2$V1[grep(x = SCUO_BAC2$V1, pattern = "GC*.*=")], 
                         SCUO = rep(SCUO_BAC2$V1[grep(x = SCUO_BAC2$V1, pattern = "SCUO*")], each = 4),
                         Gene = rep(Gene_BAC2, each = 4)
)
SCUO_BAC2$GC <- as.numeric(gsub(SCUO_BAC2$GC, pattern = ".*=", replacement = ""))
SCUO_BAC2$SCUO <- as.numeric(gsub(SCUO_BAC2$SCUO, pattern = ".*=", replacement = ""))

# Merge data to one dataframe
SCUO_merged_gen <- data.frame(rbind(SCUO_LIMNO, SCUO_BAC1, SCUO_BAC2),
           Genome_ID = c(rep("Limnohabitans MAG", nrow(SCUO_LIMNO)), rep("Bacteroidetes MAG1", nrow(SCUO_BAC1)), 
           rep("Bacteroidetes MAG2", nrow(SCUO_BAC2))),
            GCx = rep(c("GC_mean", "GC1", "GC2", "GC3"), 
                     (nrow(SCUO_LIMNO) + nrow(SCUO_BAC1) + nrow(SCUO_BAC2))/4
            )
)

# Merge codon bias data with KO pathway annotation
SCUO_merged <- dplyr::left_join(SCUO_merged_gen, merged_gc_ko[, c(1:2,4 ,14:21)], by = c("Gene" = "contig_geneID"))
          
# Visualize differences in codon bias
p_SCUO.1 <- ggplot(data = SCUO_merged, aes (x = 100*GC, y = SCUO, fill = Genome_ID))+
  geom_point(size = 4, shape = 21, alpha = 0.7)+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme_bw()+
  facet_wrap(~GCx, ncol = 2)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 0, hjust = 1),
        strip.text.x=element_text(size=18),
        legend.position = "bottom")+
  ylab("SCUO")+
  xlab("%GC")+
  ylim(0,1)

print(p_SCUO.1)

# Subset to genes for which ko annotation is available
SCUO_merged_sb <- SCUO_merged[!is.na(SCUO_merged$level_A), ]
SCUO_merged_sb <- SCUO_merged_sb[SCUO_merged_sb$GCx == "GC_mean", ]
# Look at pathways enriched in high %GC
# SCUO_merged_sb[]


SCUO_merged_gen_gcmean <- SCUO_merged_gen %>% dplyr::filter(GCx == "GC_mean")

p_SCUO.2 <- ggplot(data = SCUO_merged_gen_gcmean, aes (x = Genome_ID, y = SCUO))+
  geom_jitter(size = 3, alpha = 0.3, shape = 21, aes(fill = Genome_ID))+
  geom_boxplot(alpha=0, size =1.5, color = "darkorange")+
  # scale_fill_brewer(palette = "Accent")+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("Codon bias - SCUO")+
  xlab("")+
  ylim(0,1)+ 
  guides(fill=FALSE)+ 
  scale_x_discrete(labels=c("Bacteroidetes MAG1" = paste("Bacteroidetes MAG1 (n=",table(SCUO_merged_gen_gcmean$Genome_ID)[1],")", sep = ""),
                            "Bacteroidetes MAG2" = paste("Bacteroidetes MAG2 (n=",table(SCUO_merged_gen_gcmean$Genome_ID)[2],")", sep = ""),
                            "Limnohabitans MAG" = paste("Limnohabitans MAG (n=",table(SCUO_merged_gen_gcmean$Genome_ID)[3],")", sep = ""))
  )
# 
print(p_SCUO.2)

tmp <- SCUO_merged_sb$genome_id
tmp2 <- cbind(SCUO_merged_sb$ko_id, 
      c(rep(col_limno, table(tmp)[3]), rep(col_bac1, table(tmp)[1]), rep(col_bac2, table(tmp)[2]))
)

write.table(tmp2, file = "All_KO.tsv", quote = FALSE,
            col.names = FALSE, row.names = FALSE)
# Import codonO results of reference genomes
ref_SCUO <- codonO_2_df(pathx = "./IMG_annotation/References/codonO_output/")

# reimport codonO results of Limno genome
Ramli_SCUO <- codonO_2_df(pathx = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/",
                          patternx = "codonO")

# Merge dataframes
ref_limno_SCUO <- rbind(ref_SCUO, Ramli_SCUO)

# Replace genome names by better annotated names
map_scuo <- read.delim("./Mapping_files/codonO_ref_names.tsv", stringsAsFactors = FALSE)
ref_limno_SCUO$Genome <- as.character(ref_limno_SCUO$Genome)
for(i in 1:nrow(map_scuo)){
  ref_limno_SCUO$Genome[ref_limno_SCUO$Genome %in% map_scuo$codon_file[i]] <- map_scuo$ref_name[i]
}
ref_limno_SCUO$Genome[ref_limno_SCUO$Genome %in% "2724679690.genes.fna.codonO.output"] <- "Ramli. sp. MAG"

# order data according to mean SCUO
# sum_scuo <- ref_limno_SCUO %>% group_by(Genome) %>% summarize(mean_scuo = mean(SCUO))
# ref_limno_SCUO$Genome <- factor(ref_limno_SCUO$Genome, levels = sum_scuo$Genome[order(sum_scuo$mean_scuo)])

ord_list_bin <- c("Lim. sp. Rim11", "Lim. sp. 103DPR2",
                  "Lim. sp. 2KL-27", "Lim. sp. Rim47",
                  "Lim. sp. II-D5", "Lim. sp. 2KL-3",
                  "Rhodo. sp. ED16","Rhodo. sp. T118",
                  "Curvi. sp. ATCC", "Curvi. sp. PAE-UM",
                  "Vario. sp. 110B", "Vario. sp. EPS",
                  "Ramli. sp. Leaf400", "Ramli. sp. TTB310",
                  "Ramli. sp. MAG", 
                  "Ramli. sp. 5-10"
                  )

# order rows and columns
ref_limno_SCUO$Genome <- factor(ref_limno_SCUO$Genome, levels = ord_list_bin)


# ref_limno_SCUO$new <- ref_limno_SCUO$Genome=="Lim. MAG (2724679690)"

# Plot SCUO profiles
p_ramli_SCUO <- ref_limno_SCUO %>% dplyr::filter(GCx == "GC_mean") %>% 
  ggplot(aes(x = Genome, y = SCUO, fill = Genome))+
  theme_bw()+
    # geom_rect(data = tp, aes(fill = new), xmin = -Inf, xmax = Inf,
            # ymin = -Inf,ymax = Inf, alpha = 0.005, show.legend =FALSE, inherit.aes = FALSE)+
  geom_violin(alpha = 0.4, adjust = 1, draw_quantiles = TRUE)+
  scale_fill_manual(values = c(rep(adjustcolor("#c8c8ff",0.8),6), rep("#f8cf94",2), 
                               rep("#adf7ad",2), rep(adjustcolor("#000000",0.21),2),
                               rep("#e2a2fd",4)))+
  theme(axis.text=element_text(size=13), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        # axis.text.x = element_text(angle = 65, hjust = 1),
        strip.text.x=element_text(size=18),
        legend.position="bottom",
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())+
  guides(fill=FALSE)+
  stat_summary(fun.data=mean_sdl, fun.args = list(mult = 1), 
                 geom="pointrange", color="black")+
  xlab("")+
  ylab("")+
  # geom_vline(xintercept = 9.5, col = 'black', lwd = 2, linetype = 1, alpha = 0.6)+
  ylim(0,1)

p_ramli_SCUO

p_ramli_GC <- ref_limno_SCUO  %>% dplyr::filter(GCx == "GC_mean") %>% 
  ggplot(aes(x = Genome, y = GC, fill = Genome))+
  theme_bw()+
  geom_hline(yintercept = 0.7, col = 'black', lwd = 1, linetype = 2, alpha = 0.6)+
    # geom_rect(data = tp, aes(fill = new), xmin = -Inf, xmax = Inf,
            # ymin = -Inf,ymax = Inf, alpha = 0.005, show.legend =FALSE, inherit.aes = FALSE)+
  geom_violin(alpha = 0.4, adjust = 1, draw_quantiles = TRUE)+
  scale_fill_manual(values = c(rep(adjustcolor("#c8c8ff",0.8),6), rep("#f8cf94",2), 
                               rep("#adf7ad",2), rep(adjustcolor("#000000",0.21),2),
                               rep("#e2a2fd",4)))+
  theme(axis.text=element_text(size=13), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        # axis.text.x = element_text(angle = 65, hjust = 1),
        strip.text.x=element_text(size=18),
        legend.position="bottom",
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())+
  guides(fill=FALSE)+
  stat_summary(fun.data=mean_sdl, fun.args = list(mult = 1), 
                 geom="pointrange", color="black")+
  xlab("")+
  ylab("")+
  scale_y_continuous(labels = function(x) sprintf("%.2f", x), breaks = seq(0.20,0.90,0.10),
                     limits = c(0.2,0.9))
  # scale_y_continuous(breaks = seq(0.20,0.90,0.10), limits = c(0.2,0.9))

p_ramli_GC

# Format file for annotation of phylogenetic tree

9. PosiGene analysis for identifying genes under positive selection in the Limnohabitans MAG

Unrooted phylogenomic tree used in codeML analysis. Green branch indicates the genome for which PSG were tested.

Phylogenetic tree used in PosiGene analysis Alternatively, the entire Ramlibacter clade can be tested for PSGs: Green branch indicates the clade for which the LCA was tested for PSGs.

PosiGene also tries to minimize false positives/negatives through additional filtering:

This filtering step can also be seen as an instrument to reduce false negatives. Few badly conserved sequences can force the first mentioned filter to delete large parts of the MSA reducing the power of the test and potentially removing positively selected sites. Third, entire MSAs can be discarded if they are considered unreliable for the following reasons, if: (i) a small absolute number or a small percentage of alignment columns or anchor species codons remain after the first filtering step, (ii) few sequences remain after the second filtering step, (iii) disproportional dN/dS ratios (e.g. ≥100 in foreground branch) were calculated by CODEML or (iv) an implausibly high fraction of positively selected sites was inferred. Additionally, MSAs will only be considered if at least one species from the sister taxon (i.e. the most closely-related species/clade) of the examined branch is represented in it. Without this condition it is not possible to say whether potentially observed selective pressure worked on the branch of interest or before in evolution .

# Import results file of genome-specific PSGs
data_posi <- read.table("./posigene_analysis/result_tables/Ramlibacter_MAG_results.tsv", header = TRUE, fill = TRUE, sep = "\t")[, c("Transcript", "P.Value","FDR", "HA.foreground.omega", 
                                                                                                                                     "Number.of.Sites.under.positive.Selection")]
# Import results file of Ramlibacter clade-specific PSGs
data_posi_clade <- read.table("./posigene_analysis/result_tables_clade/Ramlibacter_MAG_results.tsv", header = TRUE, fill = TRUE, sep = "\t")[, c("Transcript", "P.Value","FDR", "HA.foreground.omega", 
                                                                                                                                     "Number.of.Sites.under.positive.Selection")]


colnames(data_posi)[1] <- "Gene"; colnames(data_posi_clade)[1] <- "Gene"
data_posi$Gene <- as.character(data_posi$Gene)
data_posi_clade$Gene <- as.character(data_posi_clade$Gene)

# Import mapping file to link gene IDs from PosiGene to 
# those used by the IMG annotation (both are in headers from .genes.fna)
map_posi <- read.table("./Mapping_files/posiG_mapping.tsv")
colnames(map_posi) <- c("posi_geneID", "IMG_geneID")
map_posi$posi_geneID <- as.character(map_posi$posi_geneID)

# Filter out the genes that are under positive selection
# Taking threshold of adjusted p.value of 0.05 and FDR < 0.05
# Also remove dN/dS ratios of less than 15.
data_posi <- data_posi %>% filter(P.Value < 0.05 & 
                                    FDR < 0.05 & HA.foreground.omega < 10)
data_posi_clade <- data_posi_clade %>% filter(P.Value < 0.05 & 
                                    FDR < 0.05 & HA.foreground.omega < 10)
# data_posi <- data_posi %>% filter(P.Value < 0.05 & 
#                                     FDR < 0.05 & HA.foreground.omega < 10 &
#                                     Number.of.Sites.under.positive.Selection > 10)
# Report summary
cat(paste("There are ", nrow(data_posi), " genes under positive selection in this MAG (P<0.05). This is ",round(100*nrow(data_posi)/nrow(map_posi),1), "% of all genes",  sep = "")
)
## There are 415 genes under positive selection in this MAG (P<0.05). This is 10.8% of all genes
cat(paste("There are ", nrow(data_posi_clade), " genes under positive selection in the Ramlibacter clade (P<0.05). This is ", round(100*nrow(data_posi_clade)/nrow(map_posi),1), "% of all genes in the Ramlibacter sp. MAG which was used as reference/anchor species",  sep = "")
)
## There are 184 genes under positive selection in the Ramlibacter clade (P<0.05). This is 4.8% of all genes in the Ramlibacter sp. MAG which was used as reference/anchor species
# Merge this data with the functional annotation (e.g. KO) of these genes
data_posi <- left_join(data_posi, map_posi, by = c("Gene" = "posi_geneID"))
data_posi_clade <- left_join(data_posi_clade, map_posi, by = c("Gene" = "posi_geneID"))
data_posi_KO <- left_join(data_posi, merged_gc_ko, by = c("IMG_geneID" = "contig_geneID"))
data_posi_clade_KO <- left_join(data_posi_clade, merged_gc_ko, by = c("IMG_geneID" = "contig_geneID"))

# Retain clade or MAG only genes in the respective dataframes
pos <- !data_posi_KO$Gene %in% data_posi_clade_KO$Gene
pos2 <- !data_posi_clade_KO$Gene %in% data_posi_KO$Gene
data_posi_KO <- data_posi_KO[pos, ]
data_posi_clade_KO <- data_posi_clade_KO[pos2, ]

# Optional: write table for quick view in iPath v2
write.table(file = "KO_posiG.tsv", unique(data_posi_KO$ko_id), quote = FALSE,
            row.names = FALSE, col.names = FALSE)
write.table(file = "KO_posiG_clade.tsv", unique(data_posi_clade_KO$ko_id), quote = FALSE,
            row.names = FALSE, col.names = FALSE)
# cat("Genes under positive selection with KO annotation")

# Merge dataframes to plot

# Remove levels without "n_level" number of genes
n_level <- round(0.01*sum(table(data_posi_clade_KO$level_C)),0)
posiG_p_df_clade  <- table(data_posi_clade_KO$level_C)[table(data_posi_clade_KO$level_C)>n_level]
posiG_p_df_clade <- data.frame(posiG_p_df_clade); posiG_p_df_clade$Var1 <- as.character(posiG_p_df_clade$Var1)

n_level <- round(0.01*sum(table(data_posi_KO$level_C)),0)
posiG_p_df_MAG  <- table(data_posi_KO$level_C)[table(data_posi_KO$level_C)>n_level]
posiG_p_df_MAG <- data.frame(posiG_p_df_MAG); posiG_p_df_MAG$Var1 <- as.character(posiG_p_df_MAG$Var1)

# Merge dataframes
posiG_p_df_merged <- data.frame(rbind(posiG_p_df_clade, posiG_p_df_MAG),
                                branch = c(rep("clade", nrow(posiG_p_df_clade)), 
                                           rep("MAG", nrow(posiG_p_df_MAG))))
data_posi_KO_merge <- rbind(data_posi_KO, data_posi_clade_KO)

# Merge with level B annotation
posiG_p_df_merged <- left_join(posiG_p_df_merged, data_posi_KO_merge[, c("level_A","level_B","level_C")],
                       by = c("Var1" = "level_C")) %>% distinct()
posiG_p_df_merged$level_B <- substring(posiG_p_df_merged$level_B, 7)
posiG_p_df_merged$level_A <- substring(posiG_p_df_merged$level_A, 7)
posiG_p_df_merged$Var1 <- substring(posiG_p_df_merged$Var1, 7)
posiG_p_df_merged$Var1 <- gsub("\\[[^\\]]*\\]", "", posiG_p_df_merged$Var1 , perl=TRUE)
posiG_p_df_merged$level_B[posiG_p_df_merged$level_B == "Cellular community - prokaryotes"] <- "Biofilm formation & quorum sensing"

# Sort according to frequency
posiG_p_df_merged$Var1 <- factor(posiG_p_df_merged$Var1, levels = unique(posiG_p_df_merged$Var1[rev(order(posiG_p_df_merged$Freq))]))

posiG_p_df_merged$level_B <- factor(posiG_p_df_merged$level_B, levels = unique(posiG_p_df_merged$level_B[rev(order(posiG_p_df_merged$Freq))]))

# Add extra column to shorten number of labels in legend
# Only show 12 most frequent categories
posiG_p_df_merged <- posiG_p_df_merged %>% mutate(level_C_short = 
                                                    Var1)
tmp <- levels(posiG_p_df_merged$Var1)[1:11]
posiG_p_df_merged$level_C_short <- as.character(posiG_p_df_merged$level_C_short)
posiG_p_df_merged$level_C_short[!posiG_p_df_merged$level_C_short %in% tmp] <- "Other"
posiG_p_df_merged$level_C_short <- factor(posiG_p_df_merged$level_C_short,
                                          levels = c(tmp,"Other"))
# Make plot
p_KO_posi <- ggplot(posiG_p_df_merged, aes(x = level_B, y = Freq, fill = level_C_short))+
  geom_bar(stat="identity", color = "black")+
  theme_bw()+
  scale_fill_brewer(palette="Paired")+
  ggtitle("Number of genes")+
  ylab("") + xlab("")+
  facet_grid(branch~.)+
  theme(axis.text=element_text(size=12.5), axis.title=element_text(18),
        title=element_text(size=18), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 65, hjust = 1),
        strip.text=element_text(size=18),
        plot.margin = unit(c(1,1,1,1), "cm"), legend.title = element_blank()
        # ,legend.position = c(0.87, 0.85)
        )

print(p_KO_posi)

# Correlate genes under positive selection with their GC content
data_SCUO_posi <- SCUO_merged_gen_gcmean %>% filter(Genome_ID == "Limnohabitans MAG")
data_SCUO_posi <- droplevels(data_SCUO_posi)
data_SCUO_posi$posi_select <- data_SCUO_posi$Gene %in% data_posi$IMG_geneID
data_SCUO_posi$posi_select <- factor(data_SCUO_posi$posi_select, levels = c("FALSE", "TRUE"))
# merge with dN/dS ratio data
data_SCUO_posi <- left_join(data_SCUO_posi,
                            data_posi, by = c("Gene" = "IMG_geneID"))
data_SCUO_posi_only <- data_SCUO_posi %>% filter(posi_select == "TRUE")

# Selected annotation level (KO)
selected_KOlevels <- c("Signal transduction",
"Amino acid metabolism",
"Metabolism of other amino acids",
"Metabolism of cofactors and vitamins",
"Carbohydrate metabolism",
"Energy metabolism",
"Translation",
"Membrane transport",
"Biosynthesis of other secondary metabolites",
"Lipid metabolism",
"Membrane transport",
"Unknown"
)

# Extract level_C labels of genes in top 5 dN/ds ratio
# First filter out the genes with multiple annotations by
# selecting the annotation with the highest identity.
# data_posi_KO_unique <- data_posi_KO %>% group_by(Gene) %>%
  # filter(percent_identity == max(percent_identity))
  
top <- 10
thresh <- sort(data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega, decreasing = TRUE)[top+1]
selected_KOlevels_labels <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$level_C
selected_KOlevels_labels[data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega < thresh] <- ""

# Remove everything between brackets ([*])
selected_KOlevels_labels <- gsub(pattern = " *\\[.*?\\] *", 
                                 replacement = "", selected_KOlevels_labels)
# Remove everything before first space
selected_KOlevels_labels <- sub(".*? (.+)", "\\1", selected_KOlevels_labels)

p_SCUO_posi1 <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ] %>% 
  ggplot(aes(x = level_B, y = HA.foreground.omega,
                                                   fill = level_B))+
  # geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
  geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
  geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
               outlier.shape = NA)+
  scale_fill_brewer(palette = "Set3")+
  # scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=12.5), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 60, hjust = 1),
        strip.text.x=element_text(size=18),
        legend.position = "bottom")+
  xlab("")+ 
  ylab(expression(dN/dS))+
  ylim(0,20)+
  guides(size = FALSE, fill = FALSE)+
  geom_label_repel(aes(label = selected_KOlevels_labels), fontface = 'bold', color = 'black',
                   box.padding = 0.35, point.padding = 0.5,
                   segment.color = "#3F4656",
                   size = 3,
                       # Width of the line segments.
                   segment.size = 1,
                   # Draw an arrow from the label to the data point.
                   arrow = arrow(length = unit(0.02, 'npc'), type = "closed"),
                   nudge_x = -0.1,
                   nudge_y = 0.6,
                   force = 10
                   
  )

p_SCUO_posi1

# Extract level_C labels of genes in top 5 dN/ds ratio
top <- 10
thresh <- sort(data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega, decreasing = TRUE)[top+1]
selected_KOlevels_labels <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$ko_name
selected_KOlevels_labels[data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega < thresh] <- ""

# Remove everything between brackets ([*])
selected_KOlevels_labels <- gsub(pattern = " *\\[.*?\\] *", 
                                 replacement = "", selected_KOlevels_labels)

p_SCUO_posi2 <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ] %>% 
  ggplot(aes(x = level_B, y = HA.foreground.omega,
                                                   fill = level_B))+
  # geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
  geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
  geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
               outlier.shape = NA)+
  scale_fill_brewer(palette = "Set3")+
  # scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=12.5), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 60, hjust = 1),
        strip.text.x=element_text(size=18),
        legend.position = "bottom")+
  xlab("")+ 
  ylab(expression(dN/dS))+
  ylim(0,20)+
  guides(size = FALSE, fill = FALSE)+
  geom_label_repel(aes(label = selected_KOlevels_labels), fontface = 'bold', color = 'black',
                   box.padding = 0.35, point.padding = 0.5,
                   segment.color = "#3F4656",
                   size = 3,
                       # Width of the line segments.
                   segment.size = 1,
                   # Draw an arrow from the label to the data point.
                   arrow = arrow(length = unit(0.02, 'npc'), type = "closed"),
                   nudge_x = -0.1,
                   nudge_y = 0.6,
                   force = 10
                   
  )

p_SCUO_posi2

# Do the same thing for the Ramlibacter clade-level PSGs
top <- 10
thresh <- sort(data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega, decreasing = TRUE)[top+1]
selected_KOlevels_labels <- data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$level_C
selected_KOlevels_labels[data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega < thresh] <- ""

# Remove everything between brackets ([*])
selected_KOlevels_labels <- gsub(pattern = " *\\[.*?\\] *", 
                                 replacement = "", selected_KOlevels_labels)
# Remove everything before first space
selected_KOlevels_labels <- sub(".*? (.+)", "\\1", selected_KOlevels_labels)

p_SCUO_posi3 <- data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ] %>% 
  ggplot(aes(x = level_B, y = HA.foreground.omega,
                                                   fill = level_B))+
  # geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
  geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
  geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
               outlier.shape = NA)+
  scale_fill_brewer(palette = "Set3")+
  # scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=12.5), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 60, hjust = 1),
        strip.text.x=element_text(size=18),
        legend.position = "bottom")+
  xlab("")+ 
  ylab(expression(dN/dS))+
  ylim(0,20)+
  guides(size = FALSE, fill = FALSE)+
  geom_label_repel(aes(label = selected_KOlevels_labels), fontface = 'bold', color = 'black',
                   box.padding = 0.35, point.padding = 0.5,
                   segment.color = "#3F4656",
                   size = 3,
                       # Width of the line segments.
                   segment.size = 1,
                   # Draw an arrow from the label to the data point.
                   arrow = arrow(length = unit(0.02, 'npc'), type = "closed"),
                   nudge_x = -0.1,
                   nudge_y = 0.6,
                   force = 10
                   
  )

p_SCUO_posi3

# Extract level_C labels of genes in top 5 dN/ds ratio
top <- 10
thresh <- sort(data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega, decreasing = TRUE)[top+1]
selected_KOlevels_labels <- data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$ko_name
selected_KOlevels_labels[data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega < thresh] <- ""

# Remove everything between brackets ([*])
selected_KOlevels_labels <- gsub(pattern = " *\\[.*?\\] *", 
                                 replacement = "", selected_KOlevels_labels)

p_SCUO_posi4 <- data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ] %>% 
  ggplot(aes(x = level_B, y = HA.foreground.omega,
                                                   fill = level_B))+
  # geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
  geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
  geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
               outlier.shape = NA)+
  scale_fill_brewer(palette = "Set3")+
  # scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=12.5), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 60, hjust = 1),
        strip.text.x=element_text(size=18),
        legend.position = "bottom")+
  xlab("")+ 
  ylab(expression(dN/dS))+
  ylim(0,20)+
  guides(size = FALSE, fill = FALSE)+
  geom_label_repel(aes(label = selected_KOlevels_labels), fontface = 'bold', color = 'black',
                   box.padding = 0.35, point.padding = 0.5,
                   segment.color = "#3F4656",
                   size = 3,
                       # Width of the line segments.
                   segment.size = 1,
                   # Draw an arrow from the label to the data point.
                   arrow = arrow(length = unit(0.02, 'npc'), type = "closed"),
                   nudge_x = -0.1,
                   nudge_y = 0.6,
                   force = 10
                   
  )

p_SCUO_posi4

Pangenome analysis

  • Genomes were annotated with COG ids through anvi-run-ncbi-cogs by blast searches.
  • Pangenome was made using the --use-ncbif-blast flag as recommended by the developers.
  • Summary of panG protein clusters were manually selected in the interactive interface.
  • The amino acid sequences in the protein cluster of the Ramlibacter MAG were further annotated with the KEGG orthology.
for gene in `cat gene_list_pan.tsv`; do
    anvi-get-dna-sequences-for-gene-calls -c 121950_assembled.db --gene-caller-ids $gene -o genes_tmp.fa
    cat genes_tmp.fa >> genes_pan.fa
done
Pangenome visualized in anvio

Pangenome visualized in anvio

panG <- read.table("./panG/SUMMARY_Ramli_PCs/panG-ramli_protein_clusters_summary.txt", header = TRUE, fill = TRUE, sep = "\t")[ , c("bin_name", "genome_name",  "gene_callers_id",  "COG_CATEGORY_ACC", "COG_CATEGORY", "COG_FUNCTION_ACC", "COG_FUNCTION", "aa_sequence")]
panG$aa_sequence <- gsub("-", "", panG$aa_sequence)
# panG_MAG <- panG %>% filter(bin_name %in% c("MAG_PC", "Ramli_5-10_PC", 
#                                           "Ramli_Leaf400_PC", "Ramli_TTB310_PC"))

write.table(paste(unique(panG_MAG$COG_FUNCTION_ACC), "W10", "#e2a2fd", sep=" "), "./panG/cog_id_panG.txt", row.names = FALSE,
            quote = FALSE, col.names = FALSE)
## Error in unique(panG_MAG$COG_FUNCTION_ACC): object 'panG_MAG' not found
write.table(paste(">", panG_MAG$gene_callers_id,
                  "\n", panG_MAG$aa_sequence, sep = ""), 
            "./panG/aaSeq_panG.fa", row.names = FALSE,
            quote = FALSE, col.names = FALSE)
## Error in paste(">", panG_MAG$gene_callers_id, "\n", panG_MAG$aa_sequence, : object 'panG_MAG' not found
write.table(unique(panG_ko_cog$gene_callers_id), 
            "./panG/gene_ids_pan.tsv", row.names = FALSE,
            quote = FALSE, col.names = FALSE)
## Error in unique(panG_ko_cog$gene_callers_id): object 'panG_ko_cog' not found
# Import KEGG annotation through KAAS (http://www.genome.jp/tools/kaas/) of amino
# acid sequences
panG_ko <- read.delim("./panG/panG_PC_MAG_KO-annotation.tsv")
panG_ko <- panG_ko[panG_ko$ko_id != "",]
panG_ko$ko_id <- gsub(" ","", panG_ko$ko_id)

# Annotate KO_IDs with hierarchy
panG_ko <- dplyr::left_join(panG_ko, ko_path_df, by = c("ko_id" = "KO"))

# join with corresponding COG ids
panG_ko_cog <- dplyr::left_join(panG_MAG, panG_ko, by = c("gene_callers_id" = "gene_id"))
## Error in dplyr::left_join(panG_MAG, panG_ko, by = c(gene_callers_id = "gene_id")): object 'panG_MAG' not found
panG_ko_cog$level_B <- substring(panG_ko_cog$level_B, 7)
## Error in substring(panG_ko_cog$level_B, 7): object 'panG_ko_cog' not found
panG_ko_cog$level_C <- substring(panG_ko_cog$level_C, 7)
## Error in substring(panG_ko_cog$level_C, 7): object 'panG_ko_cog' not found
panG_ko_cog$level_C <- gsub("\\[[^\\]]*\\]", "", panG_ko_cog$level_C , perl=TRUE)
## Error in gsub("\\[[^\\]]*\\]", "", panG_ko_cog$level_C, perl = TRUE): object 'panG_ko_cog' not found
# Shorten/change level_B annotation a bit
panG_ko_cog$level_B[panG_ko_cog$level_B == "Cellular community - prokaryotes"] <- "Biofilm formation & quorum sensing"
## Error in panG_ko_cog$level_B[panG_ko_cog$level_B == "Cellular community - prokaryotes"] <- "Biofilm formation & quorum sensing": object 'panG_ko_cog' not found
panG_ko_cog$level_B[panG_ko_cog$level_B == "Xenobiotics biodegradation and metabolism"] <- "Xenobiotics degradation"
## Error in panG_ko_cog$level_B[panG_ko_cog$level_B == "Xenobiotics biodegradation and metabolism"] <- "Xenobiotics degradation": object 'panG_ko_cog' not found
panG_ko_cog$level_C[panG_ko_cog$level_C == "Biofilm formation - Escherichia coli "] <- "Biofilm formation"
## Error in panG_ko_cog$level_C[panG_ko_cog$level_C == "Biofilm formation - Escherichia coli "] <- "Biofilm formation": object 'panG_ko_cog' not found
panG_ko_cog$level_C[panG_ko_cog$level_C == "Biofilm formation - Pseudomonas aeruginosa "] <- "Xenobiotics degradation"
## Error in panG_ko_cog$level_C[panG_ko_cog$level_C == "Biofilm formation - Pseudomonas aeruginosa "] <- "Xenobiotics degradation": object 'panG_ko_cog' not found
# Remove levels without 10 genes
panG_p_df  <- table(panG_ko_cog$level_C)[table(panG_ko_cog$level_C)>5]
## Error in table(panG_ko_cog$level_C): object 'panG_ko_cog' not found
panG_p_df <- data.frame(panG_p_df); panG_p_df$Var1 <- as.character(panG_p_df$Var1)
## Error in data.frame(panG_p_df): object 'panG_p_df' not found
## Error in eval(expr, envir, enclos): object 'panG_p_df' not found
# Merge with level B annotation
panG_p_df <- left_join(panG_p_df, panG_ko_cog[, c("level_B","level_C")],
                       by = c("Var1" = "level_C")) %>% distinct()
## Error in left_join(panG_p_df, panG_ko_cog[, c("level_B", "level_C")], : object 'panG_p_df' not found
panG_p_df <- panG_p_df[rev(order(panG_p_df$Freq)),]
## Error in eval(expr, envir, enclos): object 'panG_p_df' not found
panG_p_df$Var1 <- factor(panG_p_df$Var1[rev(order(panG_p_df$Freq))], 
                         levels = panG_p_df$Var1[rev(order(panG_p_df$Freq))])
## Error in factor(panG_p_df$Var1[rev(order(panG_p_df$Freq))], levels = panG_p_df$Var1[rev(order(panG_p_df$Freq))]): object 'panG_p_df' not found
panG_p_df$Freq <- panG_p_df$Freq/length(unique(panG_ko_cog$gene_callers_id))
## Error in eval(expr, envir, enclos): object 'panG_p_df' not found
# Plot distribution of panG annotation
p_panG1 <- ggplot(panG_p_df, aes(x = Var1, y = 100*Freq, fill = level_B))+
  geom_bar(stat="identity", color = "black")+
  theme_bw()+
  scale_fill_brewer(palette="Paired")+
  ggtitle("Proportion of genes in pangenome (%)")+
  ylab("") + xlab("")+
  theme(axis.text=element_text(size=12.5), axis.title=element_text(18),
        title=element_text(size=18), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18),
        plot.margin = unit(c(1,1,1,1), "cm"), legend.title = element_blank()
        # ,legend.position = c(0.87, 0.85)
        )
## Error in ggplot(panG_p_df, aes(x = Var1, y = 100 * Freq, fill = level_B)): object 'panG_p_df' not found
print(p_panG1)
## Error in print(p_panG1): object 'p_panG1' not found
# Now select the genes that are PSGs
blast_panG <- read.delim("./panG/genes_pan.blast", header = FALSE)
colnames(blast_panG) <- c("qseqid", "sseqid", "pident", "length", "mismatch",
                          "gapopen", "qstart", "qend", "sstart", "send", 
                          "evalue", "bitscore")

blast_panG$qseqid <- do.call(rbind, strsplit(as.character(blast_panG$qseqid), split = "|", fixed = TRUE))[,1]

10. ANI analysis using pyani

# read file
# data.ANI <- read.table("./ANI/ANIb_percentage_identity.tab")
data.ANI <- read.table("./ANI/ANIb_percentage_identity.tab")


# Replace genome names by better annotated names
map_ani <- read.delim("./Mapping_files/pyani_ref_names.tsv", stringsAsFactors = FALSE)
for(i in 1:nrow(map_ani)){
 colnames(data.ANI)[colnames(data.ANI) %in% map_ani$ani_file[i]] <- map_ani$ref_name[i]
 rownames(data.ANI)[rownames(data.ANI) %in% map_ani$ani_file[i]] <- map_ani$ref_name[i]
}

# Order y-axis according to phylogenetic tree order
ord_list_bin <- c("Lim. sp. Rim11", "Lim. sp. 103DPR2",
                  "Lim. sp. 2KL-27", "Lim. sp. Rim47",
                  "Lim. sp. II-D5", "Lim. sp. 2KL-3",
                  "Rhodo. sp. ED16","Rhodo. sp. T118",
                  "Curvi. sp. ATCC", "Curvi. sp. PAE-UM",
                  "Vario. sp. 110B", "Vario. sp. EPS",
                  "Ramli. sp. Leaf400", "Ramli. sp. TTB310",
                  "Ramli. sp. MAG", 
                  "Ramli. sp. 5-10"
                  )

# order rows and columns
data.ANI <- data.ANI[, order(match(colnames(data.ANI), ord_list_bin))]
data.ANI <- data.ANI[order(match(rownames(data.ANI), ord_list_bin)), ]

data.ANI <- get_upper_tri(data.ANI)

# melt to dataframe
df_pyani <- melt(as.matrix(data.ANI), na.rm = TRUE) # reshape into dataframe
names(df_pyani)[c(1:3)] <- c("bin1", "bin2", "ANI")
df_pyani[, 1] <- as.character(df_pyani[, 1]); df_pyani[, 2] <- as.character(df_pyani[, 2])
df_pyani$bin2 <- factor(df_pyani$bin2, levels = ord_list_bin)
df_pyani$bin1 <- factor(df_pyani$bin1, levels = rev(ord_list_bin))

# make plot
p_ani <- ggplot(data = df_pyani,
       aes(x = bin2, y = bin1, fill = ANI))+
  theme(axis.title=element_text(size=16), strip.text.x=element_text(size=16),
        legend.title=element_text(size=15), legend.text=element_text(size=14),
        axis.text.y = element_blank(),
        axis.text.x = element_text(size=16, angle = 55, hjust = 0),
        title=element_text(size=20),
        plot.margin = unit(c(1.1,1.1,1.1,1.1), "cm"),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "transparent",colour = NA),
        plot.background = element_rect(fill = "transparent",colour = NA)
  )+ 
  geom_raster()+
  scale_fill_distiller(palette = "RdBu", name = "Average\nNucleotide\nIdentity\n",
                       limits = c(0.75,1.0), oob=squish) +
  geom_text(aes(label = round(ANI, 2)), size = 4.5)+
  xlab("")+
  ylab("")+
  scale_x_discrete(position = "top") 

print(p_ani)

Plasmid reconstruction using Recycler

files <- list.files("./recycler_plasmid/", pattern="*_length_cov")
files <- paste0("./recycler_plasmid/", files)
df_plasm_16 <- read.table(files[1], header = FALSE)
df_plasm_17 <- read.table(files[2], header = FALSE)
df_plasm_64 <- read.table(files[3], header = FALSE)
df_plasm_65 <- read.table(files[4], header = FALSE)
df_plasm <- rbind(df_plasm_16, df_plasm_17, df_plasm_64, df_plasm_65)

df_plasm <- as.character(df_plasm$V1); tmp <- df_plasm
df_plasm <- do.call(rbind, strsplit(df_plasm, "_"))
df_plasm <- df_plasm[, -c(1,3,5,7,8)]
colnames(df_plasm) <- c("NODE_ID", "Length", "avg_coverage")
df_plasm <- apply(df_plasm, 2, function(x) as.integer(x))
df_plasm <- data.frame(df_plasm, node_IDs = tmp)

# Filter out plasmid scaffolds lower than 1000 bp
p_plasm1 <- df_plasm %>% filter(Length > 1000) %>% 
  ggplot(aes(x = Length/1000, y = sqrt(avg_coverage), fill = Length))+
  geom_point(shape = 21, size = 4)+ theme_bw()+
  # scale_x_log10()+
  # scale_y_log10()+
  scale_fill_continuous()+
  ylab(expression(sqrt(Average_coverage)))+
  xlab("Length (kbp)")+
  ggtitle("Coverage-Length distribution of reconstructed plasmid fragments")

p_plasm1

11. Gene set enrichment analysis

library("GSAR")